TopHat mode: G
library('ballgown')
## Loading required package: methods
##
## Attaching package: 'ballgown'
##
## The following object is masked from 'package:base':
##
## structure
library('GenomicRanges')
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library('usefulstuff')
library('forecast')
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: timeDate
## This is forecast 5.8
library('ggplot2')
library('plyr')
##
## Attaching package: 'plyr'
##
## The following object is masked from 'package:IRanges':
##
## desc
##
## The following object is masked from 'package:S4Vectors':
##
## rename
Loads the coverage and splits it by gene for each sample. Then it calculates the correlation (and \(R^2\)) between the Geuvadis sample coverage and each of the simulated scenarios including simply drawing numbers from the negative binomial (aka, skipping alignment). For each Geuvadis gene-sample, an ARIMA model is fitted, then one more for each simulated scenario using the simulated coverage as a predictor. The coefficient for the predictor is extracted as well as its corresponding p-value. Similar numbers are calculated between replicates of the same simulation scenario.
## Load coverage data
load(file.path(tophat, 'ex_regCov.Rdata'))
## Define exons
gff <- gffReadGR(file.path('..', 'select_genes', 'twenty_genes.gtf'))
genes <- split(gff, strip_quotes(getAttributeField(as.character(gff$group),
'gene_id')))
exons <- unlist(reduce(genes))
## Split by gene
geneCov <- split(ex_regCov, names(exons))
geneCov <- mapply(function(x, y) {
res <- do.call(rbind, x)
## Save the bases
rownames(res) <- unlist(mapply(seq, start(exons[names(exons) == y]), end(exons[names(exons) == y])))
return(res)
}, geneCov, names(geneCov), SIMPLIFY = FALSE)
## Split by sample
geu_samples <- c('NA06985', 'NA12144', 'NA12776', 'NA18858', 'NA20542',
'NA20772', 'NA20815')
set.seed(20150221)
sampleCov <- lapply(geneCov, function(x) {
result <- lapply(seq_len(length(geu_samples)), function(i) {
idx <- grepl(paste0(geu_samples[i], '|s', i), colnames(x))
res <- x[, idx]
colnames(res) <- c('Geuvadis', 'd1', 'd2', 'r1', 'r2')
m <- mean(res$Geuvadis)
res$b1a <- rnbinom(nrow(res), size = 1, mu = m)
res$b1b <- rnbinom(nrow(res), size = 1, mu = m)
res$b6a <- rnbinom(nrow(res), size = 6, mu = m)
res$b6b <- rnbinom(nrow(res), size = 6, mu = m)
return(res)
})
names(result) <- geu_samples
return(result)
})
fitArima <- function(x, y = 'Geuvadis', order, d, auto = FALSE) {
if(auto) {
f <- auto.arima(d[, y], xreg = d[, x])
order <- arimaorder(f)
} else {
f <- tryCatch(arima(d[, y], xreg = d[, x], order = order), error = function(e) { return(list(error = TRUE))})
}
if(!is.null(f$error)) return(list(order = NA, coef = NA, z = NA))
z <- coef(f)["d[, x]"] / sqrt(f$var.coef["d[, x]", "d[, x]"])
return(list(order = paste(order, collapse = '-'), coef = coef(f)["d[, x]"], z = z))
}
rmsd <- function(x, y = 'Geuvadis', d) {
sqrt(mean( (d[, y] / max(d[, y]) - d[, x] / max(d[, x]))^2 ))
}
corr <- function(x, y = 'Geuvadis', d) {
cor(d[, y], d[, x])
}
sets <- c('d1', 'd2', 'r1', 'r2', 'b1a', 'b1b', 'b6a', 'b6b')
reps_a <- sets[rep(c(TRUE, FALSE), 4)]
reps_b <- sets[rep(c(FALSE, TRUE), 4)]
## Save as it takes about 20-30 mins to compute
if(file.exists(file.path(tophat, 'pred_cov.Rdata'))) {
load(file.path(tophat, 'pred_cov.Rdata'))
} else {
pred_cov <- lapply(names(sampleCov), function(gene) {
res <- lapply(names(sampleCov[[gene]]), function(samp) {
d <- sampleCov[[gene]][[samp]]
## Arima
ord <- arimaorder(auto.arima(d$Geuvadis))
info <- c(lapply(sets, fitArima, order = ord, d = d), mapply(fitArima, x = reps_b, y = reps_a, MoreArgs = list(d = d, order = ord), SIMPLIFY = FALSE))
orders <- unlist(sapply(info, '[', 'order'))
coefs <- unlist(sapply(info, '[', 'coef'))
pvals <- pnorm(abs(unlist(sapply(info, '[', 'z'))), lower = FALSE) * 2
## RMSD
RMSD <- c(sapply(sets, rmsd, d = d), mapply(rmsd, reps_b, reps_a, MoreArgs = list(d = d)))
## Cor and R2
CORR <- c(sapply(sets, corr, d = d), mapply(corr, reps_b, reps_a, MoreArgs = list(d = d)))
data.frame(set = c(sets, paste(reps_a, reps_b, sep = '-')), order = orders, coef = coefs, pval = pvals, rmsd = RMSD, cor = CORR, R2 = CORR^2, type = c(rep(c('d', 'r', 'b1', 'b6'), each = 2), 'd', 'r', 'b1', 'b6'), group = rep(c('Geuvadis', 'replicate'), c(8, 4)), gene = gene, sample = samp, row.names = NULL, stringsAsFactors = FALSE)
})
do.call(rbind, res)
})
pred_cov <- do.call(rbind, pred_cov)
pred_cov$set <- factor(pred_cov$set, levels = c(sets, paste(reps_a, reps_b, sep = '-')))
## Significant pval?
pred_cov$sig <- pred_cov$pval <= 0.05
## Extract ARIMA order info
pred_cov$ar <- as.integer(substr(pred_cov$order, 1, 1))
pred_cov$i <- as.integer(substr(pred_cov$order, 3, 3))
pred_cov$ma <- as.integer(substr(pred_cov$order, 5, 5))
save(pred_cov, file = file.path(tophat, 'pred_cov.Rdata'))
}
Explore which ARIMA models were selected for each gene-sample scenario.
## Types of models selected
sort(table(pred_cov$order), decreasing = TRUE)
##
## 1-1-1 2-1-2 0-1-2 3-1-2 0-1-4 1-1-2 1-2-1 2-2-1 3-1-3 4-1-3 5-2-0 0-1-1
## 300 120 108 84 60 60 60 60 60 60 60 48
## 1-1-3 4-1-2 0-1-0 0-2-1 2-1-0 2-2-3 3-1-5 3-2-1 0-1-5 1-1-4 2-1-3 3-2-2
## 48 48 36 36 36 36 36 36 24 24 24 24
## 4-1-4 4-2-1 0-2-2 0-2-3 1-1-5 2-1-4 2-1-5 2-2-4 3-1-1 3-2-3 4-1-5 5-1-3
## 24 24 12 12 12 12 12 12 12 12 12 12
## 5-1-4 5-2-5
## 12 12
table(pred_cov$ar) /12
##
## 0 1 2 3 4 5
## 28 42 26 22 14 8
table(pred_cov$i) /12
##
## 1 2
## 107 33
table(pred_cov$ma) /12
##
## 0 1 2 3 4 5
## 11 48 38 22 12 9
ggplot(subset(pred_cov, set == 'd1'), aes(x = ar)) + geom_bar(stat = 'bin') + facet_grid(i ~ ma ) + theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Percent of significant ARIMA coefficients for each of the simulated scenarios. Either overall percent, or first summarized by genes or by samples.
sig <- ddply(pred_cov, .(set, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot() + ylim(c(0, 100)) + ylab('Percent') + ggtitle('Significant coefs') + theme_bw()
sig <- ddply(pred_cov, .(set, sample, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot(outlier.shape = 5) + ylim(c(0, 100)) + ylab('Percent: 7 samples') + ggtitle('Significant coefs: by genes') + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 1/2) + theme_bw()
sig <- ddply(pred_cov, .(set, gene, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot(outlier.shape = 5) + ylim(c(0, 100)) + ylab('Percent: 20 genes') + ggtitle('Significant coefs: by samples') + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 1/2) + theme_bw()
Boxplots of ARIMA coefficient p-values. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).
ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Boxplots of ARIMA coefficients. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).
ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Relationship between p-values and coefficients for the ARIMA models.
ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ .) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw()
ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ sample) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ gene) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Boxplots of root mean square distance where each vector has been standardized by its maximum. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).
ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Boxplots of correlation. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).
ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Comparison between correlation and RMSD-max.
ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ .) + theme_bw()
ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ gene) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
Boxplots of \(R^2\) (correlation squared since its a simple linear model). First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).
ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, size=8))
## Reproducibility info
# Time spent in this report
diff(c(startTime, Sys.time()))
## Time difference of 57.65481 mins
Sys.time()
## [1] "2015-02-23 02:42:46 EST"
options(width = 120)
devtools::session_info()
## Session info -----------------------------------------------------------------------------------------------------------
## setting value
## version R version 3.1.1 Patched (2014-10-16 r66782)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz <NA>
## Packages ---------------------------------------------------------------------------------------------------------------
## package * version date source
## annotate * 1.44.0 2014-10-15 Bioconductor
## AnnotationDbi * 1.28.1 2014-10-29 Bioconductor
## ballgown 1.99.3 2015-02-20 Github (alyssafrazee/ballgown@7d8fb1c)
## base64enc * 0.1-2 2014-06-26 CRAN (R 3.1.0)
## BatchJobs * 1.5 2014-10-30 CRAN (R 3.1.1)
## BBmisc * 1.9 2015-02-03 CRAN (R 3.1.1)
## Biobase * 2.26.0 2014-10-15 Bioconductor
## BiocGenerics 0.12.1 2014-11-15 Bioconductor
## BiocParallel * 1.0.3 2015-02-09 Bioconductor
## Biostrings * 2.34.1 2014-12-13 Bioconductor
## bitops * 1.0-6 2013-08-17 CRAN (R 3.1.0)
## brew * 1.0-6 2011-04-13 CRAN (R 3.1.0)
## Cairo * 1.5-6 2014-06-26 CRAN (R 3.1.0)
## checkmate * 1.5.1 2014-12-14 CRAN (R 3.1.1)
## codetools * 0.2-9 2014-08-21 CRAN (R 3.1.1)
## colorspace * 1.2-4 2013-09-30 CRAN (R 3.1.0)
## DBI * 0.3.1 2014-09-24 CRAN (R 3.1.1)
## devtools * 1.7.0 2015-01-17 CRAN (R 3.1.1)
## digest * 0.6.8 2014-12-31 CRAN (R 3.1.1)
## evaluate * 0.5.5 2014-04-29 CRAN (R 3.1.0)
## fail * 1.2 2013-09-19 CRAN (R 3.1.0)
## foreach * 1.4.2 2014-04-11 CRAN (R 3.1.0)
## forecast 5.8 2015-01-06 CRAN (R 3.1.1)
## formatR * 1.0 2014-08-25 CRAN (R 3.1.1)
## fracdiff * 1.4-2 2012-12-02 CRAN (R 3.1.1)
## genefilter * 1.48.1 2014-10-17 Bioconductor
## GenomeInfoDb 1.2.4 2014-12-20 Bioconductor
## GenomicAlignments * 1.2.1 2014-11-05 Bioconductor
## GenomicRanges 1.18.4 2015-01-08 Bioconductor
## ggplot2 1.0.0 2014-05-21 CRAN (R 3.1.0)
## gtable * 0.1.2 2012-12-05 CRAN (R 3.1.0)
## htmltools * 0.2.6 2014-09-08 CRAN (R 3.1.1)
## IRanges 2.0.1 2014-12-13 Bioconductor
## iterators * 1.0.7 2014-04-11 CRAN (R 3.1.0)
## knitr * 1.9 2015-01-20 CRAN (R 3.1.1)
## labeling * 0.3 2014-08-23 CRAN (R 3.1.1)
## lattice * 0.20-29 2014-04-04 CRAN (R 3.1.1)
## limma * 3.22.4 2015-01-16 Bioconductor
## MASS * 7.3-35 2014-09-30 CRAN (R 3.1.1)
## Matrix * 1.1-4 2014-06-15 CRAN (R 3.1.1)
## mgcv * 1.8-3 2014-08-29 CRAN (R 3.1.1)
## munsell * 0.4.2 2013-07-11 CRAN (R 3.1.0)
## nlme * 3.1-118 2014-10-07 CRAN (R 3.1.1)
## nnet * 7.3-8 2014-03-28 CRAN (R 3.1.1)
## plyr 1.8.1 2014-02-26 CRAN (R 3.1.0)
## proto * 0.3-10 2012-12-22 CRAN (R 3.1.0)
## quadprog * 1.5-5 2013-04-17 CRAN (R 3.1.0)
## RColorBrewer * 1.1-2 2014-12-07 CRAN (R 3.1.1)
## Rcpp * 0.11.4 2015-01-24 CRAN (R 3.1.1)
## RCurl * 1.95-4.5 2014-12-06 CRAN (R 3.1.1)
## reshape2 * 1.4.1 2014-12-06 CRAN (R 3.1.1)
## rmarkdown 0.5.1 2015-01-26 CRAN (R 3.1.1)
## Rsamtools * 1.18.2 2014-11-12 Bioconductor
## RSQLite * 1.0.0 2014-10-25 CRAN (R 3.1.1)
## rstudioapi * 0.2 2014-12-31 CRAN (R 3.1.1)
## rtracklayer * 1.26.2 2014-11-12 Bioconductor
## S4Vectors 0.4.0 2014-10-15 Bioconductor
## scales * 0.2.4 2014-04-22 CRAN (R 3.1.0)
## sendmailR * 1.2-1 2014-09-21 CRAN (R 3.1.1)
## stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0)
## survival * 2.37-7 2014-01-22 CRAN (R 3.1.1)
## sva * 3.12.0 2014-10-15 Bioconductor
## timeDate 3012.100 2015-01-23 CRAN (R 3.1.1)
## tseries * 0.10-33 2015-02-10 CRAN (R 3.1.1)
## usefulstuff 1.01 2015-02-19 Github (alyssafrazee/usefulstuff@b260fe9)
## XML * 3.98-1.1 2013-06-20 CRAN (R 3.1.0)
## xtable * 1.7-4 2014-09-12 CRAN (R 3.1.1)
## XVector * 0.6.0 2014-10-15 Bioconductor
## yaml * 2.1.13 2014-06-12 CRAN (R 3.1.1)
## zlibbioc * 1.12.0 2014-10-15 Bioconductor
## zoo 1.7-11 2014-02-27 CRAN (R 3.1.0)